Question 1

A

library(ggplot2)
library(tidyverse)

rm(list=ls())

# Define x
x = seq(0,1, length.out=100)

#x = rnorm(100,0,1)
# We choose equally spaced notes over the range [0,1]
knots = seq(0,1,length.out=8) # we set this to 8 as the first and last knots will be removed, so this ensures we are left with 6

# Basis functions per Woods formulation

b1 = function(x) {
  rep(1, length(x))
}

b2 = function(x){
  x
}

R_x_z = function(x, z) {
  ((z - 0.5)^2 - 1/12) * ((x - 0.5)^2 - 1/12) / 4 - 
  ((abs(x - z) - 0.5)^4 - 0.5 * (abs(x - z) - 0.5)^2 + 7/240) / 24
}

# Construct the basis matrix

basis_matrix = cbind(b1(x), b2(x),sapply(knots[-c(1,length(knots))],function(k) R_x_z(x,k)))

basis_design_matrix = basis_matrix %>% as_tibble()
                  
colnames(basis_design_matrix) = c("b1", "b2", "b3", "b4", paste0("b", 5:(4 + length(knots))))
basis_design_matrix = basis_design_matrix %>%
                       mutate(x = x)

basis_melted <- reshape2::melt(basis_design_matrix, id.vars = "x")

ggplot(basis_melted, aes(x = x, y = value, color = variable)) + 
  geom_line(size = 1) + 
  labs(title = "Wood Cubic Spline Basis",
       x = "x", y = "Basis Function Value", color = "Basis") +
  theme_minimal()

B

set.seed(123)
n <- 100
x = rnorm(n,0,1)

y <- 5 + sin(3 * pi * (x-0.6)) + rnorm(n, sd = 0.5^2)


knots <- seq(min(x), max(x), length.out = 32) # We define length of 32 because the first and last knots will be removed, to ensure we have exactly 30 knots

# Construct penalty matrix
knot_positions <- knots[-c(1, length(knots))]
n_knots = length(knot_positions)
S = matrix(0, n_knots, n_knots)
for(i in 1:n_knots){
  for (j in 1: n_knots){
    S[i,j] = R_x_z(knot_positions[i], knot_positions[j])
  }
}

penalized_regression_spline = function(lambda, S, knot_positions, y){
  basis_matrix <- cbind(
  b1(x),
  b2(x),
  sapply(knot_positions, function(k) R_x_z(x, k))
)
  P = lambda * S
  X  = basis_matrix[, 3:ncol(basis_matrix)]
  XtX_plus_p = t(X)%*%X + P 
  XtY = t(X)%*%y
  beta_hat = solve(XtX_plus_p)%*%XtY
  fitted_values = X %*% beta_hat
  H = X %*% solve(XtX_plus_p) %*% t(X)
  se_errors = sqrt(diag(H))
  return (list(fitted_values = fitted_values , se_errors = se_errors))
}

fit_result = penalized_regression_spline(0.00000001, S, knot_positions,y) # choose a random lambda value

plot_data <- data.frame(x = x, y = y, y_hat = fit_result$fitted_values, se_errors=fit_result$se_errors)

ggplot(plot_data, aes(x = x, y = y)) + 
  geom_point(color = 'black', alpha = 0.6) +
  geom_line(aes(y = y_hat), color = 'blue', size = 1.2) +
  labs(title = "Penalized Regression Spline Fit",
       x = "X",
       y = "Y") +
  theme_minimal()

C

# (95% confidence intervals)
upper <- fit_result$fitted_values + 1.96 * fit_result$se_errors
lower <- fit_result$fitted_values - 1.96 * fit_result$se_errors


plot_data <- data.frame(x = x, y = y, y_hat = fit_result$fitted_values, lower = lower, upper = upper)

ggplot(plot_data, aes(x = x, y = y)) + 
  geom_point(color = 'black', alpha = 0.6) +
  geom_line(aes(y = y_hat), color = 'blue', size = 1.2) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.1, fill = 'blue') +
  labs(title = "Penalized Regression Spline Fit with Confidence Bands",
       x = "X",
       y = "Y") +
  theme_minimal()

D

compute_gcv = function(lambda){
  basis_matrix <- cbind(
  b1(x),
  b2(x),
  sapply(knot_positions, function(k) R_x_z(x, k))
)
  P = lambda * S
  X  = basis_matrix[, 3:ncol(basis_matrix)]
  XtX_plus_p = t(X)%*%X + P 
  XtY = t(X)%*%y
  beta_hat = solve(XtX_plus_p)%*%XtY
  fitted_values = X %*% beta_hat
  
  # Compute residuals
  residuals = y - fitted_values
  H = X %*% solve(XtX_plus_p)%*%t(X)
  edf = sum(diag(H))
  
  # Compute the GCV score
  n = length(y)
  gcv = sum(residuals^2)/(n*(1-edf/n)^2)
  return (gcv)
}

lambda_values <- seq(0.00000001, 1, length.out = 5000)
gcv_scores <- sapply(lambda_values, compute_gcv)

gcv_data <- data.frame(lambda = lambda_values, gcv = gcv_scores)

ggplot(gcv_data, aes(x = lambda, y = gcv)) +
  geom_line(color = 'blue', size = 1.2) +
  labs(title = "GCV as a Function of the Smoothing Parameter",
       x = "Smoothing Parameter (lambda)",
       y = "GCV Score") +
  theme_minimal()

Question 2

Tensor Product Spline

library(gamair)
library(tidyverse)
library(dplyr)

rm(list=ls())

# Basis functions per Woods formulation

b1 = function(x) {
  rep(1, length(x))
}

b2 = function(x){
  x
}

R_x_z = function(x, z) {
  ((z - 0.5)^2 - 1/12) * ((x - 0.5)^2 - 1/12) / 4 - 
  ((abs(x - z) - 0.5)^4 - 0.5 * (abs(x - z) - 0.5)^2 + 7/240) / 24
}

data("brain")

brain_data = brain%>%as_tibble()%>%
             select(X,Y, medFPQ)

tensor_product_spline = function(x,y, response,x_knot_positions, y_knot_positions, x_penalty_matrix, y_penalty_matrix, lambda, penalize=FALSE) {
  
  # Scale x and y
  x_scaled = scale(x)
  y_scaled = scale(y)
  
  y_data = as.matrix(response)
  x_basis_matrix <- cbind(sapply(x_knot_positions, function(k) R_x_z(x_scaled, k)))
  y_basis_matrix <- cbind(sapply(y_knot_positions, function(k) R_x_z(y_scaled, k)))
  
  tensor_basis_matrix = compute_tensor_product(x_basis_matrix, y_basis_matrix)
  tensor_penalty_matrix = compute_tensor_product(x_penalty_matrix, y_penalty_matrix)
  
  P = lambda * tensor_penalty_matrix
  X  = tensor_basis_matrix[1:nrow(y_data),]
  XtX_plus_p = t(X)%*%X + P 
  
  XtY = t(X)%*%y_data
  beta_hat = MASS::ginv(XtX_plus_p)%*%XtY
  fitted_values = X %*% beta_hat
  H = X %*% MASS::ginv(XtX_plus_p) %*% t(X)
  se_errors = sqrt(diag(H))
  return (list(fitted_values = fitted_values , se_errors = se_errors))
}

compute_tensor_product = function(A, B){
  m1 = ncol(A)
  m2 = ncol(B)
  G = matrix(NA, nrow=nrow(A)*nrow(B), ncol = m1 * m2 )
  ccol <- 1
  for (j in 1:m1) {
    for (k in 1:m2) {
      G[, ccol] <- A[, j] * B[, k]
      ccol <- ccol + 1
    }
  }
  return (G)
}

x_knots <- seq(min(brain_data$X), max(brain$X), length.out = 10) 
y_knots <- seq(min(brain_data$Y), max(brain$Y), length.out = 10) 

# Construct penalty matrix
construct_penalty_matrix <- function(knots) {
  n_knots <- length(knots)
  S <- matrix(0, n_knots, n_knots)
  for (i in 1:n_knots) {
    for (j in 1:n_knots) {
      S[i, j] <- R_x_z(knots[i], knots[j])
    }
  }
  S
}
S_x = construct_penalty_matrix(x_knots)
S_y = construct_penalty_matrix(y_knots)

tensor_product_spline_result = tensor_product_spline(
  brain_data$X, 
  brain_data$Y, 
  brain_data$medFPQ,
  x_knots, 
  y_knots,
  S_x,
  S_y,
  0.00001
)

# Visualize the original data
ggplot(brain_data, aes(x = X, y = Y)) +
  geom_point(aes(color = medFPQ), size = 2, alpha = 0.6) +
  scale_color_viridis_c() +
  labs(title = "Original Data",
       x = "X",
       y = "Y",
       color = "Observed medFPQ") +
  theme_minimal()

brain_data <- brain_data %>% mutate(Fitted = tensor_product_spline_result$fitted_values)

ggplot(brain_data, aes(x = X, y = Y)) +
  geom_tile(aes(fill = Fitted)) +
  scale_fill_viridis_c() +
  labs(title = "Tensor Product Spline Fit",
       x = "X",
       y = "Y",
       fill = "Fitted medFPQ") +
  theme_minimal()

# Visualize the observed medFPQ values
ggplot(brain_data, aes(x = X, y = Y)) +
  geom_point(aes(color = medFPQ), size = 2, alpha = 0.6) +
  scale_color_viridis_c() +
  labs(title = "Observed medFPQ",
       x = "X",
       y = "Y",
       color = "Observed medFPQ") +
  theme_minimal()

plot_data <- data.frame(
  X = brain_data$X,
  Y = brain_data$Y,
  Original = brain_data$medFPQ,
  Fitted = tensor_product_spline_result$fitted_values
)

plot_data_long <- tidyr::pivot_longer(plot_data, 
                                      cols = c(Original, Fitted),
                                      names_to = "Type",
                                      values_to = "Value")

plot_data_long$Type <- factor(plot_data_long$Type, levels = c("Original", "Fitted"))

# Create the plot
ggplot(plot_data_long, aes(x = X, y = Y, fill = Value)) +
  geom_tile() +
  facet_wrap(~ Type) +
  scale_fill_viridis_c() +
  labs(title = "Tensor Product Original vs Fitted Values",
       x = "X coordinate",
       y = "Y coordinate",
       fill = "medFPQ") +
  theme_minimal() +
  theme(aspect.ratio = 1)

Thin Plate Spline

library(gamair)
library(tidyverse)
library(dplyr)

rm(list=ls())

data("brain")

brain_data = brain %>% as_tibble() %>%
             select(X, Y, medFPQ)

# Radial Basis function
tps_basis = function(r){
  ifelse(r > 0, r^2 * log(r), 0)
}

create_design_matrix = function(data, knots){
  n = nrow(data)
  m = nrow(knots)
  # Create matrices of coordinates
  data_mat <- as.matrix(data[, 1:2])
  knots_mat <- as.matrix(knots[, 1:2])
  
  # Compute pairwise distances
  distances <- fields::rdist(data_mat, knots_mat)
  
  # Apply the thin-plate spline basis function to all distances at once
  B <- apply(distances, c(1,2), tps_basis)
  
  A = cbind(1, data)
  X = cbind(A, B)
  return (list(X = X, A = A, B = B))
}

thin_plate_spline = function(data, lambda, num_knots){
  n = nrow(data)
  knot_indices = seq(1, n, length.out = num_knots) 
  knots = data[knot_indices, 1:2]
  
  # Create design matrix
  design_matrix = create_design_matrix(data[,1:2], knots)
  X = as.matrix(design_matrix$X)
  A = design_matrix$A
  B = design_matrix$B
  
  # Create penalty matrix
  m = ncol(B)
  P = rbind(
    cbind(matrix(0, 3, 3), matrix(0, 3, m)),
    cbind(matrix(0, m, 3), B[knot_indices,])
  )
  y = as.matrix(data[, 3])
  XtX_plus_p = t(X) %*% X + (lambda * P)
  XtY = t(X) %*% y
  beta_hat = solve(XtX_plus_p) %*% XtY
  fitted_values = X %*% beta_hat
  colnames(fitted_values) = "Fitted"
  H = X %*% solve(XtX_plus_p) %*% t(X)
  se_errors = sqrt(diag(H))
  return (list(fitted_values = fitted_values, se_errors = se_errors))
}

tps_result = thin_plate_spline(brain_data, 0.5, 10)

head(tps_result$fitted_values)
##         Fitted
## [1,] 1.4844050
## [2,] 1.2056177
## [3,] 1.3681456
## [4,] 1.4847700
## [5,] 1.5476221
## [6,] 0.9391614
# Visualize the fitted thin-plate spline without the original data
brain_data <- brain_data %>% mutate(Fitted = tps_result$fitted_values)

ggplot(brain_data, aes(x = X, y = Y)) +
  geom_tile(aes(fill = Fitted)) +
  scale_fill_viridis_c() +
  labs(title = "Thin-Plate Spline Fit",
       x = "X",
       y = "Y",
       fill = "Fitted medFPQ") +
  theme_minimal()

# Reshape the data for plotting
plot_data <- data.frame(
  X = brain_data$X,
  Y = brain_data$Y,
  Original = brain_data$medFPQ,
  Fitted = tps_result$fitted_values
)

plot_data_long <- tidyr::pivot_longer(plot_data, cols = c(Original, Fitted), names_to = "Type", values_to = "medFPQ")

# Change the order of the factors in the Type column to show "Original" first
plot_data_long$Type <- factor(plot_data_long$Type, levels = c("Original", "Fitted"))

p = ggplot(plot_data_long, aes(x = X, y = Y, fill = medFPQ)) +
  geom_tile() +
  facet_wrap(~ Type) +
  scale_fill_viridis_c() +
  labs(title = "Thin-Plate Spline: Original vs Fitted Values",
       x = "X", 
       y = "Y", 
       fill = "medFPQ") +
  theme_minimal() +
  theme(aspect.ratio = 1)

print(p)

ggsave("tps_original_vs_fitted.png", p, width = 12, height = 6)

Tensor Product and Thin Plate Spline: Using mgcv function

# Load necessary libraries
library(gamair)
library(mgcv)
library(tidyverse)
library(ggplot2)

# Load the data
data(brain)
brain_data <- brain %>%
  as_tibble() %>%
  select(X, Y, medFPQ)

# Fit the tensor-product spline model using mgcv
tp_model <- gam(medFPQ ~ te(X, Y, k = 10), data = brain_data)

# Fit the thin-plate spline model using mgcv
tps_model <- gam(medFPQ ~ s(X, Y, bs = "tp", k = 100), data = brain_data)

# Summarize the models
summary(tp_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medFPQ ~ te(X, Y, k = 10)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.24791    0.03003   41.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##           edf Ref.df     F p-value    
## te(X,Y) 90.09  91.84 9.934  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.347   Deviance explained = 38.4%
## GCV = 1.5007  Scale est. = 1.4135    n = 1567
summary(tps_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## medFPQ ~ s(X, Y, bs = "tp", k = 100)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.24791    0.03029   41.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(X,Y) 86.49  96.33 8.503  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.335   Deviance explained = 37.2%
## GCV = 1.5232  Scale est. = 1.4381    n = 1567
# Predict using the mgcv models
brain_data$Fitted_tp <- predict(tp_model, newdata = brain_data)
brain_data$Fitted_tps <- predict(tps_model, newdata = brain_data)

# Plot the original data
original_plot <- ggplot(brain_data, aes(x = X, y = Y, fill = medFPQ)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(title = "Original Data",
       x = "X coordinate",
       y = "Y coordinate",
       fill = "medFPQ") +
  theme_minimal()

# Plot the tensor-product spline fitted data
tp_plot <- ggplot(brain_data, aes(x = X, y = Y, fill = Fitted_tp)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(title = "Tensor-Product Spline Fit using mgcv",
       x = "X coordinate",
       y = "Y coordinate",
       fill = "Fitted medFPQ") +
  theme_minimal()

# Plot the thin-plate spline fitted data
tps_plot <- ggplot(brain_data, aes(x = X, y = Y, fill = Fitted_tps)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(title = "Thin-Plate Spline Fit using mgcv",
       x = "X coordinate",
       y = "Y coordinate",
       fill = "Fitted medFPQ") +
  theme_minimal()

# Save the plots
ggsave("original_plot.png", original_plot, width = 8, height = 6)
ggsave("tp_plot.png", tp_plot, width = 8, height = 6)
ggsave("tps_plot.png", tps_plot, width = 8, height = 6)

Question 3: GAMs and MARS

GAM and Logistic Regression

library(mgcv)
rm(list = ls())
pulsar_data = read.csv2("Pulsar.csv", header = TRUE, sep=",")
head(pulsar_data)
##   Mean_Integrated          SD           EK     Skewness Mean_DMSNR_Curve
## 1        140.5625 55.68378214 -0.234571412 -0.699648398      3.199832776
## 2     102.5078125 58.88243001  0.465318154 -0.515087909      1.677257525
## 3      103.015625 39.34164944  0.323328365  1.051164429      3.121237458
## 4          136.75 57.17844874 -0.068414638 -0.636238369      3.642976589
## 5      88.7265625 40.67222541  0.600866079  1.123491692      1.178929766
## 6      93.5703125 46.69811352   0.53190485  0.416721117      1.636287625
##   SD_DMSNR_Curve EK_DMSNR_Curve Skewness_DMSNR_Curve Class
## 1    19.11042633    7.975531794          74.24222492     0
## 2    14.86014572    10.57648674          127.3935796     0
## 3    21.74466875    7.735822015          63.17190911     0
## 4     20.9592803     6.89649891          53.59366067     0
## 5     11.4687196    14.26957284          252.5673058     0
## 6    14.54507425     10.6217484          131.3940043     0
# Split the data into training and test sets (80% training, 20% test)
set.seed(10032024)
sample <- sample.int(n = nrow(pulsar_data), size = floor(.8*nrow(pulsar_data)), replace = F)
train <- pulsar_data[sample, ]
test <- pulsar_data[-sample, ]

# Separate features and target variable
X_train <- train[, !names(train) %in% "Class"]
y_train <- train$Class
X_test <- test[, !names(test) %in% "Class"]
y_test <- test$Class

# Fit Logistic Regession Model (GLM)

#log_regression_model = glm(Class ~., data = train, family = binomial)


#gam_model <- gam(Class ~ s(Mean_Integrated) + s(SD) + s(EK) + s(Skewness) + s(Mean_DMSNR_Curve) + s(SD_DMSNR_Curve) + #s(EK_DMSNR_Curve) + s(Skewness_DMSNR_Curve), data = training_data, family = binomial)

MARS

# library(earth)
# 
# # Fit the MARS model
# mars_model <- earth(Class ~ ., data = train)
# 
# 
# y_pred <- predict(mars_model, newdata = X_test, type = "response")
# 
# y_pred_binary <- ifelse(y_pred > 0.5, 1, 0)
# 
# # Evaluate the model
# accuracy <- mean(y_pred_binary == y_test)
# cat("Accuracy:", accuracy, "\n")
# 
# # Confusion matrix and classification report
# table(Predicted = y_pred_binary, Actual = y_test)
library(earth)
library(caret)
library(ggplot2)
rm(list = ls())

pulsar_data <- read.csv("Pulsar.csv", header = TRUE, sep = ",")


# Define the features (X) and the target (y)
X <- pulsar_data[, !(names(pulsar_data) %in% c("Class"))]
y <- pulsar_data$Class

# Split the data into training (80%) and testing (20%) sets
set.seed(0)
trainIndex <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[trainIndex, ]
X_test <- X[-trainIndex, ]
y_train <- y[trainIndex]
y_test <- y[-trainIndex]

# Combine X and y for training
train_data <- data.frame(X_train, target = y_train)


# Initialize and fit the MARS model
mars_model <- earth(target ~ ., data = train_data)

# Predict on the test data
test_data <- data.frame(X_test)
y_pred_prob <- predict(mars_model, test_data)

y_pred <- ifelse(y_pred_prob > 0.5, 1, 0)

# Calculate the confusion matrix
conf_matrix <- confusionMatrix(as.factor(y_pred), as.factor(y_test))
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3234   55
##          1   13  277
##                                          
##                Accuracy : 0.981          
##                  95% CI : (0.976, 0.9852)
##     No Information Rate : 0.9072         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.8803         
##                                          
##  Mcnemar's Test P-Value : 6.627e-07      
##                                          
##             Sensitivity : 0.9960         
##             Specificity : 0.8343         
##          Pos Pred Value : 0.9833         
##          Neg Pred Value : 0.9552         
##              Prevalence : 0.9072         
##          Detection Rate : 0.9036         
##    Detection Prevalence : 0.9190         
##       Balanced Accuracy : 0.9152         
##                                          
##        'Positive' Class : 0              
## 
# Extract performance metrics
accuracy <- conf_matrix$overall['Accuracy']
sensitivity <- conf_matrix$byClass['Sensitivity']
specificity <- conf_matrix$byClass['Specificity']
precision <- conf_matrix$byClass['Precision']
f1 <- conf_matrix$byClass['F1']

# Create a data frame to store the metrics
metrics <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Precision", "F1 Score"),
  Value = c(accuracy, sensitivity, specificity, precision, f1)
)

# Print the metrics table
print(metrics)
##                  Metric     Value
## Accuracy       Accuracy 0.9810003
## Sensitivity Sensitivity 0.9959963
## Specificity Specificity 0.8343373
## Precision     Precision 0.9832776
## F1             F1 Score 0.9895961
# Create the results data frame
results <- data.frame(Actual = y_test, Predicted_Prob = y_pred_prob)

# Print the structure and head of the results data frame
str(results)
## 'data.frame':    3579 obs. of  2 variables:
##  $ Actual: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ target: num  -0.01737 -0.02365 0.00249 -0.01425 0.04996 ...
head(results)
##   Actual      target
## 1      0 -0.01737459
## 2      0 -0.02364795
## 3      0  0.00248694
## 4      0 -0.01424527
## 5      0  0.04996036
## 6      0 -0.02077186
# # Visualize the results using ggplot2
# ggplot(results, aes(x = Actual, y = Predicted_Prob)) +
#   geom_point(alpha = 0.5) +
#   geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
#   labs(title = "Actual vs Predicted Probabilities", x = "Actual values", y = "Predicted probabilities") +
#   theme_minimal()

Question 4: Wavelets

library(fds)
library(wavelets)
library(glmnet)
library(ggplot2)
library(wavethresh)
library(tidyr)

rm(list = ls())

data(aa)
data(ao)

aa_data <- aa$y
ao_data <- ao$y

X <- rbind(aa_data, ao_data)
y <- c(rep(0, nrow(aa_data)), rep(1, nrow(ao_data)))

# Function to apply wavelet transform
apply_wavelet <- function(data, n_coef = 256) {
  next_power_of_2 <- 2^ceiling(log2(length(data)))
  padded_data <- c(data, rep(0, next_power_of_2 - length(data)))
  
  wt <- wd(padded_data, filter.number = 10, family = "DaubLeAsymm")
  all_levels <- 0:(nlevelsWT(wt) - 1)
  all_coefs <- unlist(lapply(all_levels, function(l) accessD(wt, level = l)))
  
  return(all_coefs[1:min(length(all_coefs), n_coef)])
}

# Apply wavelet transform to each row
X_wavelet <- t(apply(X, 1, apply_wavelet))

# Plot a sample of signals and their corresponding wavelet coefficients
plot_samples <- function(data, wavelet_data, title_signal, title_wavelet, sample_size = 10) {
  sampled_indices <- sample(nrow(data), sample_size)
  sampled_data <- data[sampled_indices, ]
  sampled_wavelet_data <- wavelet_data[sampled_indices, ]
  
  # Plot signals
  signal_data <- as.data.frame(t(sampled_data))
  signal_data$Index <- 1:nrow(signal_data)
  signal_data_long <- pivot_longer(signal_data, -Index, names_to = "Sample", values_to = "Amplitude")
  
  plot_signal <- ggplot(signal_data_long, aes(x = Index, y = Amplitude, color = Sample)) +
    geom_line(alpha = 0.5) +
    labs(title = title_signal, x = "Index", y = "Amplitude") +
    theme_minimal()
  
  # Plot wavelet coefficients
  coeff_data <- as.data.frame(t(sampled_wavelet_data))
  coeff_data$Index <- 1:nrow(coeff_data)
  coeff_data_long <- pivot_longer(coeff_data, -Index, names_to = "Sample", values_to = "Coefficient")
  
  plot_wavelet <- ggplot(coeff_data_long, aes(x = Index, y = Coefficient, color = Sample)) +
    geom_line(alpha = 0.5) +
    labs(title = title_wavelet, x = "Coefficient Index", y = "Coefficient Value") +
    theme_minimal()
  
  return(list(signal_plot = plot_signal, wavelet_plot = plot_wavelet))
}

# Plot for 'aa' class
aa_plots <- plot_samples(aa_data, t(apply(aa_data, 1, apply_wavelet)), 
                         "Sample of Signals from 'aa' Class", 
                         "Sample of Wavelet Coefficients for 'aa' Class")
print(aa_plots$signal_plot)

print(aa_plots$wavelet_plot)

# Plot for 'ao' class
ao_plots <- plot_samples(ao_data, t(apply(ao_data, 1, apply_wavelet)), 
                         "Sample of Signals from 'ao' Class", 
                         "Sample of Wavelet Coefficients for 'ao' Class")
print(ao_plots$signal_plot)

print(ao_plots$wavelet_plot)

# Split data
set.seed(123)
total_samples <- nrow(X_wavelet)
train_size <- floor(0.7 * total_samples)
train_indices <- sample(1:total_samples, train_size)

X_train <- X_wavelet[train_indices, ]
X_test <- X_wavelet[-train_indices, ]
y_train <- y[train_indices]
y_test <- y[-train_indices]

# Fit the model
cv_fit <- cv.glmnet(X_train, y_train, family = "binomial", alpha = 1)
best_lambda <- cv_fit$lambda.min
model <- glmnet(X_train, y_train, family = "binomial", alpha = 1, lambda = best_lambda)


# Make predictions
predictions <- predict(model, X_test, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)

# Calculate accuracy
accuracy <- mean(predicted_classes == y_test)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 1"
# Create confusion matrix
conf_matrix <- table(Predicted = predicted_classes, Actual = y_test)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix)
##          Actual
## Predicted  0  1
##         0 43  0
##         1  0 47
# Calculate evaluation metrics
precision <- conf_matrix[2, 2] / (conf_matrix[2, 2] + conf_matrix[2, 1])
recall <- conf_matrix[2, 2] / (conf_matrix[2, 2] + conf_matrix[1, 2])
f1_score <- 2 * precision * recall / (precision + recall)

metrics <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1 Score"),
  Value = c(accuracy, precision, recall, f1_score)
)

print(metrics)
##      Metric Value
## 1  Accuracy     1
## 2 Precision     1
## 3    Recall     1
## 4  F1 Score     1
# Get feature importances
coef_importance <- abs(coef(model)[-1])
top_features <- order(coef_importance, decreasing = TRUE)[1:10]

print("Top 10 most important wavelet coefficients:")
## [1] "Top 10 most important wavelet coefficients:"
print(top_features)
##  [1]  83  19 144  88  12  75 204 142 104  37
# Plot feature importances
ggplot(data.frame(Index = 1:length(coef_importance), Importance = coef_importance), 
       aes(x = Index, y = Importance)) +
  geom_point() +
  geom_point(data = data.frame(Index = top_features, Importance = coef_importance[top_features]), 
             aes(x = Index, y = Importance), color = "red", size = 3) +
  labs(title = "Wavelet Coefficient Importances", x = "Coefficient Index", y = "Absolute Coefficient Value") +
  theme_minimal()

# Extract and plot the non-zero coefficients
non_zero_coeffs <- coef(model)[coef(model) != 0]
non_zero_indices <- which(coef(model) != 0)

# Create a data frame for plotting
coeff_data <- data.frame(Index = non_zero_indices, Coefficient = non_zero_coeffs)

# Plot the non-zero coefficients
ggplot(coeff_data, aes(x = Index, y = Coefficient)) +
  geom_bar(stat = "identity") +
  labs(title = "Non-zero Coefficients from Penalized Logistic Regression",
       x = "Wavelet Coefficient Index",
       y = "Coefficient Value") +
  theme_minimal()

Question 5: Functional Data Analysis

library(gamair)
library(ggplot2)
library(fda)

rm(list = ls())
data(gas)


nir_spectra <- matrix(as.numeric(unlist(gas$NIR)), nrow = 60, byrow = TRUE)
colnames(nir_spectra) = colnames(gas$NIR)
nir_spectra <- t(nir_spectra)  # Transpose to have 401 rows and 60 columns
octane <- gas$octane

# Define the basis
nir_basis <- create.bspline.basis(rangeval = c(900, 1700), nbasis = 20)

# Convert the spectra to functional data object
nir_fd <- smooth.basis(argvals = seq(900, 1700, by = 2), y = nir_spectra, fdParobj = nir_basis)$fd


plot(nir_fd, main = "NIR Spectra", xlab = "Wavelength (nm)", ylab = "log(1/R)")

## [1] "done"
# Get the coefficients of the functional data object
nir_fd_coefs <- t(eval.fd(seq(900, 1700, by = 2), nir_fd))

# Fit a linear model using the coefficients
lm_model <- lm(octane ~ nir_fd_coefs)

summary(lm_model)
## 
## Call:
## lm(formula = octane ~ nir_fd_coefs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7141 -0.7166  0.1786  0.7055  1.9047 
## 
## Coefficients: (374 not defined because of singularities)
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.750e+01  3.831e-01 228.394   <2e-16 ***
## nir_fd_coefs1    1.044e+16  8.651e+15   1.207   0.2363    
## nir_fd_coefs2   -1.499e+16  1.778e+16  -0.843   0.4052    
## nir_fd_coefs3   -2.618e+15  1.643e+16  -0.159   0.8744    
## nir_fd_coefs4    7.115e+15  8.911e+15   0.798   0.4305    
## nir_fd_coefs5           NA         NA      NA       NA    
## nir_fd_coefs6           NA         NA      NA       NA    
## nir_fd_coefs7           NA         NA      NA       NA    
## nir_fd_coefs8           NA         NA      NA       NA    
## nir_fd_coefs9           NA         NA      NA       NA    
## nir_fd_coefs10          NA         NA      NA       NA    
## nir_fd_coefs11          NA         NA      NA       NA    
## nir_fd_coefs12          NA         NA      NA       NA    
## nir_fd_coefs13          NA         NA      NA       NA    
## nir_fd_coefs14          NA         NA      NA       NA    
## nir_fd_coefs15          NA         NA      NA       NA    
## nir_fd_coefs16          NA         NA      NA       NA    
## nir_fd_coefs17          NA         NA      NA       NA    
## nir_fd_coefs18          NA         NA      NA       NA    
## nir_fd_coefs19          NA         NA      NA       NA    
## nir_fd_coefs20          NA         NA      NA       NA    
## nir_fd_coefs21          NA         NA      NA       NA    
## nir_fd_coefs22          NA         NA      NA       NA    
## nir_fd_coefs23          NA         NA      NA       NA    
## nir_fd_coefs24          NA         NA      NA       NA    
## nir_fd_coefs25          NA         NA      NA       NA    
## nir_fd_coefs26   7.297e+14  7.204e+14   1.013   0.3187    
## nir_fd_coefs27          NA         NA      NA       NA    
## nir_fd_coefs28          NA         NA      NA       NA    
## nir_fd_coefs29          NA         NA      NA       NA    
## nir_fd_coefs30          NA         NA      NA       NA    
## nir_fd_coefs31          NA         NA      NA       NA    
## nir_fd_coefs32          NA         NA      NA       NA    
## nir_fd_coefs33          NA         NA      NA       NA    
## nir_fd_coefs34          NA         NA      NA       NA    
## nir_fd_coefs35          NA         NA      NA       NA    
## nir_fd_coefs36          NA         NA      NA       NA    
## nir_fd_coefs37          NA         NA      NA       NA    
## nir_fd_coefs38          NA         NA      NA       NA    
## nir_fd_coefs39          NA         NA      NA       NA    
## nir_fd_coefs40          NA         NA      NA       NA    
## nir_fd_coefs41          NA         NA      NA       NA    
## nir_fd_coefs42          NA         NA      NA       NA    
## nir_fd_coefs43          NA         NA      NA       NA    
## nir_fd_coefs44          NA         NA      NA       NA    
## nir_fd_coefs45          NA         NA      NA       NA    
## nir_fd_coefs46          NA         NA      NA       NA    
## nir_fd_coefs47          NA         NA      NA       NA    
## nir_fd_coefs48          NA         NA      NA       NA    
## nir_fd_coefs49  -1.560e+16  1.230e+16  -1.268   0.2139    
## nir_fd_coefs50          NA         NA      NA       NA    
## nir_fd_coefs51   1.588e+16  1.251e+16   1.269   0.2135    
## nir_fd_coefs52          NA         NA      NA       NA    
## nir_fd_coefs53          NA         NA      NA       NA    
## nir_fd_coefs54          NA         NA      NA       NA    
## nir_fd_coefs55          NA         NA      NA       NA    
## nir_fd_coefs56          NA         NA      NA       NA    
## nir_fd_coefs57          NA         NA      NA       NA    
## nir_fd_coefs58          NA         NA      NA       NA    
## nir_fd_coefs59          NA         NA      NA       NA    
## nir_fd_coefs60          NA         NA      NA       NA    
## nir_fd_coefs61          NA         NA      NA       NA    
## nir_fd_coefs62          NA         NA      NA       NA    
## nir_fd_coefs63          NA         NA      NA       NA    
## nir_fd_coefs64          NA         NA      NA       NA    
## nir_fd_coefs65          NA         NA      NA       NA    
## nir_fd_coefs66          NA         NA      NA       NA    
## nir_fd_coefs67          NA         NA      NA       NA    
## nir_fd_coefs68          NA         NA      NA       NA    
## nir_fd_coefs69          NA         NA      NA       NA    
## nir_fd_coefs70          NA         NA      NA       NA    
## nir_fd_coefs71          NA         NA      NA       NA    
## nir_fd_coefs72          NA         NA      NA       NA    
## nir_fd_coefs73  -9.198e+14  9.519e+14  -0.966   0.3411    
## nir_fd_coefs74          NA         NA      NA       NA    
## nir_fd_coefs75          NA         NA      NA       NA    
## nir_fd_coefs76          NA         NA      NA       NA    
## nir_fd_coefs77          NA         NA      NA       NA    
## nir_fd_coefs78          NA         NA      NA       NA    
## nir_fd_coefs79          NA         NA      NA       NA    
## nir_fd_coefs80          NA         NA      NA       NA    
## nir_fd_coefs81          NA         NA      NA       NA    
## nir_fd_coefs82          NA         NA      NA       NA    
## nir_fd_coefs83          NA         NA      NA       NA    
## nir_fd_coefs84          NA         NA      NA       NA    
## nir_fd_coefs85          NA         NA      NA       NA    
## nir_fd_coefs86          NA         NA      NA       NA    
## nir_fd_coefs87          NA         NA      NA       NA    
## nir_fd_coefs88          NA         NA      NA       NA    
## nir_fd_coefs89          NA         NA      NA       NA    
## nir_fd_coefs90          NA         NA      NA       NA    
## nir_fd_coefs91          NA         NA      NA       NA    
## nir_fd_coefs92          NA         NA      NA       NA    
## nir_fd_coefs93          NA         NA      NA       NA    
## nir_fd_coefs94          NA         NA      NA       NA    
## nir_fd_coefs95          NA         NA      NA       NA    
## nir_fd_coefs96  -1.701e+16  1.564e+16  -1.088   0.2848    
## nir_fd_coefs97   1.783e+16  1.587e+16   1.124   0.2695    
## nir_fd_coefs98          NA         NA      NA       NA    
## nir_fd_coefs99          NA         NA      NA       NA    
## nir_fd_coefs100         NA         NA      NA       NA    
## nir_fd_coefs101         NA         NA      NA       NA    
## nir_fd_coefs102         NA         NA      NA       NA    
## nir_fd_coefs103         NA         NA      NA       NA    
## nir_fd_coefs104         NA         NA      NA       NA    
## nir_fd_coefs105         NA         NA      NA       NA    
## nir_fd_coefs106         NA         NA      NA       NA    
## nir_fd_coefs107         NA         NA      NA       NA    
## nir_fd_coefs108         NA         NA      NA       NA    
## nir_fd_coefs109         NA         NA      NA       NA    
## nir_fd_coefs110         NA         NA      NA       NA    
## nir_fd_coefs111         NA         NA      NA       NA    
## nir_fd_coefs112         NA         NA      NA       NA    
## nir_fd_coefs113         NA         NA      NA       NA    
## nir_fd_coefs114         NA         NA      NA       NA    
## nir_fd_coefs115         NA         NA      NA       NA    
## nir_fd_coefs116         NA         NA      NA       NA    
## nir_fd_coefs117         NA         NA      NA       NA    
## nir_fd_coefs118         NA         NA      NA       NA    
## nir_fd_coefs119         NA         NA      NA       NA    
## nir_fd_coefs120 -2.020e+15  8.786e+14  -2.299   0.0282 *  
## nir_fd_coefs121         NA         NA      NA       NA    
## nir_fd_coefs122         NA         NA      NA       NA    
## nir_fd_coefs123         NA         NA      NA       NA    
## nir_fd_coefs124         NA         NA      NA       NA    
## nir_fd_coefs125         NA         NA      NA       NA    
## nir_fd_coefs126         NA         NA      NA       NA    
## nir_fd_coefs127         NA         NA      NA       NA    
## nir_fd_coefs128         NA         NA      NA       NA    
## nir_fd_coefs129         NA         NA      NA       NA    
## nir_fd_coefs130         NA         NA      NA       NA    
## nir_fd_coefs131         NA         NA      NA       NA    
## nir_fd_coefs132         NA         NA      NA       NA    
## nir_fd_coefs133         NA         NA      NA       NA    
## nir_fd_coefs134         NA         NA      NA       NA    
## nir_fd_coefs135         NA         NA      NA       NA    
## nir_fd_coefs136         NA         NA      NA       NA    
## nir_fd_coefs137         NA         NA      NA       NA    
## nir_fd_coefs138         NA         NA      NA       NA    
## nir_fd_coefs139         NA         NA      NA       NA    
## nir_fd_coefs140         NA         NA      NA       NA    
## nir_fd_coefs141         NA         NA      NA       NA    
## nir_fd_coefs142         NA         NA      NA       NA    
## nir_fd_coefs143         NA         NA      NA       NA    
## nir_fd_coefs144  1.697e+16  9.796e+15   1.732   0.0928 .  
## nir_fd_coefs145         NA         NA      NA       NA    
## nir_fd_coefs146         NA         NA      NA       NA    
## nir_fd_coefs147 -1.733e+16  1.026e+16  -1.690   0.1007    
## nir_fd_coefs148         NA         NA      NA       NA    
## nir_fd_coefs149         NA         NA      NA       NA    
## nir_fd_coefs150         NA         NA      NA       NA    
## nir_fd_coefs151         NA         NA      NA       NA    
## nir_fd_coefs152         NA         NA      NA       NA    
## nir_fd_coefs153         NA         NA      NA       NA    
## nir_fd_coefs154         NA         NA      NA       NA    
## nir_fd_coefs155         NA         NA      NA       NA    
## nir_fd_coefs156         NA         NA      NA       NA    
## nir_fd_coefs157         NA         NA      NA       NA    
## nir_fd_coefs158         NA         NA      NA       NA    
## nir_fd_coefs159         NA         NA      NA       NA    
## nir_fd_coefs160         NA         NA      NA       NA    
## nir_fd_coefs161         NA         NA      NA       NA    
## nir_fd_coefs162         NA         NA      NA       NA    
## nir_fd_coefs163         NA         NA      NA       NA    
## nir_fd_coefs164         NA         NA      NA       NA    
## nir_fd_coefs165         NA         NA      NA       NA    
## nir_fd_coefs166         NA         NA      NA       NA    
## nir_fd_coefs167  5.949e+14  2.125e+15   0.280   0.7813    
## nir_fd_coefs168         NA         NA      NA       NA    
## nir_fd_coefs169         NA         NA      NA       NA    
## nir_fd_coefs170         NA         NA      NA       NA    
## nir_fd_coefs171         NA         NA      NA       NA    
## nir_fd_coefs172         NA         NA      NA       NA    
## nir_fd_coefs173         NA         NA      NA       NA    
## nir_fd_coefs174         NA         NA      NA       NA    
## nir_fd_coefs175         NA         NA      NA       NA    
## nir_fd_coefs176         NA         NA      NA       NA    
## nir_fd_coefs177         NA         NA      NA       NA    
## nir_fd_coefs178         NA         NA      NA       NA    
## nir_fd_coefs179         NA         NA      NA       NA    
## nir_fd_coefs180         NA         NA      NA       NA    
## nir_fd_coefs181         NA         NA      NA       NA    
## nir_fd_coefs182         NA         NA      NA       NA    
## nir_fd_coefs183         NA         NA      NA       NA    
## nir_fd_coefs184         NA         NA      NA       NA    
## nir_fd_coefs185         NA         NA      NA       NA    
## nir_fd_coefs186         NA         NA      NA       NA    
## nir_fd_coefs187         NA         NA      NA       NA    
## nir_fd_coefs188         NA         NA      NA       NA    
## nir_fd_coefs189         NA         NA      NA       NA    
## nir_fd_coefs190         NA         NA      NA       NA    
## nir_fd_coefs191  1.466e+16  1.835e+16   0.799   0.4302    
## nir_fd_coefs192         NA         NA      NA       NA    
## nir_fd_coefs193         NA         NA      NA       NA    
## nir_fd_coefs194         NA         NA      NA       NA    
## nir_fd_coefs195         NA         NA      NA       NA    
## nir_fd_coefs196         NA         NA      NA       NA    
## nir_fd_coefs197 -1.807e+16  2.204e+16  -0.820   0.4184    
## nir_fd_coefs198         NA         NA      NA       NA    
## nir_fd_coefs199         NA         NA      NA       NA    
## nir_fd_coefs200         NA         NA      NA       NA    
## nir_fd_coefs201         NA         NA      NA       NA    
## nir_fd_coefs202         NA         NA      NA       NA    
## nir_fd_coefs203         NA         NA      NA       NA    
## nir_fd_coefs204         NA         NA      NA       NA    
## nir_fd_coefs205         NA         NA      NA       NA    
## nir_fd_coefs206         NA         NA      NA       NA    
## nir_fd_coefs207         NA         NA      NA       NA    
## nir_fd_coefs208         NA         NA      NA       NA    
## nir_fd_coefs209         NA         NA      NA       NA    
## nir_fd_coefs210         NA         NA      NA       NA    
## nir_fd_coefs211         NA         NA      NA       NA    
## nir_fd_coefs212         NA         NA      NA       NA    
## nir_fd_coefs213         NA         NA      NA       NA    
## nir_fd_coefs214  4.365e+15  6.249e+15   0.698   0.4900    
## nir_fd_coefs215         NA         NA      NA       NA    
## nir_fd_coefs216         NA         NA      NA       NA    
## nir_fd_coefs217         NA         NA      NA       NA    
## nir_fd_coefs218         NA         NA      NA       NA    
## nir_fd_coefs219         NA         NA      NA       NA    
## nir_fd_coefs220         NA         NA      NA       NA    
## nir_fd_coefs221         NA         NA      NA       NA    
## nir_fd_coefs222         NA         NA      NA       NA    
## nir_fd_coefs223         NA         NA      NA       NA    
## nir_fd_coefs224         NA         NA      NA       NA    
## nir_fd_coefs225         NA         NA      NA       NA    
## nir_fd_coefs226         NA         NA      NA       NA    
## nir_fd_coefs227         NA         NA      NA       NA    
## nir_fd_coefs228         NA         NA      NA       NA    
## nir_fd_coefs229         NA         NA      NA       NA    
## nir_fd_coefs230         NA         NA      NA       NA    
## nir_fd_coefs231         NA         NA      NA       NA    
## nir_fd_coefs232         NA         NA      NA       NA    
## nir_fd_coefs233         NA         NA      NA       NA    
## nir_fd_coefs234         NA         NA      NA       NA    
## nir_fd_coefs235         NA         NA      NA       NA    
## nir_fd_coefs236         NA         NA      NA       NA    
## nir_fd_coefs237         NA         NA      NA       NA    
## nir_fd_coefs238  7.822e+15  4.979e+15   1.571   0.1260    
## nir_fd_coefs239         NA         NA      NA       NA    
## nir_fd_coefs240         NA         NA      NA       NA    
## nir_fd_coefs241         NA         NA      NA       NA    
## nir_fd_coefs242         NA         NA      NA       NA    
## nir_fd_coefs243         NA         NA      NA       NA    
## nir_fd_coefs244         NA         NA      NA       NA    
## nir_fd_coefs245         NA         NA      NA       NA    
## nir_fd_coefs246         NA         NA      NA       NA    
## nir_fd_coefs247         NA         NA      NA       NA    
## nir_fd_coefs248 -1.494e+16  9.842e+15  -1.518   0.1389    
## nir_fd_coefs249         NA         NA      NA       NA    
## nir_fd_coefs250         NA         NA      NA       NA    
## nir_fd_coefs251         NA         NA      NA       NA    
## nir_fd_coefs252         NA         NA      NA       NA    
## nir_fd_coefs253         NA         NA      NA       NA    
## nir_fd_coefs254         NA         NA      NA       NA    
## nir_fd_coefs255         NA         NA      NA       NA    
## nir_fd_coefs256         NA         NA      NA       NA    
## nir_fd_coefs257         NA         NA      NA       NA    
## nir_fd_coefs258         NA         NA      NA       NA    
## nir_fd_coefs259         NA         NA      NA       NA    
## nir_fd_coefs260         NA         NA      NA       NA    
## nir_fd_coefs261  8.855e+15  5.970e+15   1.483   0.1478    
## nir_fd_coefs262         NA         NA      NA       NA    
## nir_fd_coefs263         NA         NA      NA       NA    
## nir_fd_coefs264         NA         NA      NA       NA    
## nir_fd_coefs265         NA         NA      NA       NA    
## nir_fd_coefs266         NA         NA      NA       NA    
## nir_fd_coefs267         NA         NA      NA       NA    
## nir_fd_coefs268         NA         NA      NA       NA    
## nir_fd_coefs269         NA         NA      NA       NA    
## nir_fd_coefs270         NA         NA      NA       NA    
## nir_fd_coefs271         NA         NA      NA       NA    
## nir_fd_coefs272         NA         NA      NA       NA    
## nir_fd_coefs273         NA         NA      NA       NA    
## nir_fd_coefs274         NA         NA      NA       NA    
## nir_fd_coefs275         NA         NA      NA       NA    
## nir_fd_coefs276         NA         NA      NA       NA    
## nir_fd_coefs277         NA         NA      NA       NA    
## nir_fd_coefs278         NA         NA      NA       NA    
## nir_fd_coefs279         NA         NA      NA       NA    
## nir_fd_coefs280         NA         NA      NA       NA    
## nir_fd_coefs281         NA         NA      NA       NA    
## nir_fd_coefs282         NA         NA      NA       NA    
## nir_fd_coefs283         NA         NA      NA       NA    
## nir_fd_coefs284         NA         NA      NA       NA    
## nir_fd_coefs285 -2.659e+15  1.803e+15  -1.475   0.1500    
## nir_fd_coefs286         NA         NA      NA       NA    
## nir_fd_coefs287         NA         NA      NA       NA    
## nir_fd_coefs288         NA         NA      NA       NA    
## nir_fd_coefs289         NA         NA      NA       NA    
## nir_fd_coefs290         NA         NA      NA       NA    
## nir_fd_coefs291         NA         NA      NA       NA    
## nir_fd_coefs292         NA         NA      NA       NA    
## nir_fd_coefs293         NA         NA      NA       NA    
## nir_fd_coefs294         NA         NA      NA       NA    
## nir_fd_coefs295         NA         NA      NA       NA    
## nir_fd_coefs296         NA         NA      NA       NA    
## nir_fd_coefs297         NA         NA      NA       NA    
## nir_fd_coefs298         NA         NA      NA       NA    
## nir_fd_coefs299         NA         NA      NA       NA    
## nir_fd_coefs300         NA         NA      NA       NA    
## nir_fd_coefs301         NA         NA      NA       NA    
## nir_fd_coefs302  1.592e+15  1.080e+15   1.475   0.1500    
## nir_fd_coefs303         NA         NA      NA       NA    
## nir_fd_coefs304         NA         NA      NA       NA    
## nir_fd_coefs305         NA         NA      NA       NA    
## nir_fd_coefs306         NA         NA      NA       NA    
## nir_fd_coefs307         NA         NA      NA       NA    
## nir_fd_coefs308 -6.892e+14  4.673e+14  -1.475   0.1500    
## nir_fd_coefs309         NA         NA      NA       NA    
## nir_fd_coefs310         NA         NA      NA       NA    
## nir_fd_coefs311         NA         NA      NA       NA    
## nir_fd_coefs312         NA         NA      NA       NA    
## nir_fd_coefs313         NA         NA      NA       NA    
## nir_fd_coefs314         NA         NA      NA       NA    
## nir_fd_coefs315         NA         NA      NA       NA    
## nir_fd_coefs316         NA         NA      NA       NA    
## nir_fd_coefs317         NA         NA      NA       NA    
## nir_fd_coefs318         NA         NA      NA       NA    
## nir_fd_coefs319         NA         NA      NA       NA    
## nir_fd_coefs320         NA         NA      NA       NA    
## nir_fd_coefs321         NA         NA      NA       NA    
## nir_fd_coefs322         NA         NA      NA       NA    
## nir_fd_coefs323         NA         NA      NA       NA    
## nir_fd_coefs324         NA         NA      NA       NA    
## nir_fd_coefs325         NA         NA      NA       NA    
## nir_fd_coefs326         NA         NA      NA       NA    
## nir_fd_coefs327         NA         NA      NA       NA    
## nir_fd_coefs328         NA         NA      NA       NA    
## nir_fd_coefs329         NA         NA      NA       NA    
## nir_fd_coefs330         NA         NA      NA       NA    
## nir_fd_coefs331         NA         NA      NA       NA    
## nir_fd_coefs332  6.192e+10  4.079e+10   1.518   0.1388    
## nir_fd_coefs333         NA         NA      NA       NA    
## nir_fd_coefs334         NA         NA      NA       NA    
## nir_fd_coefs335         NA         NA      NA       NA    
## nir_fd_coefs336         NA         NA      NA       NA    
## nir_fd_coefs337         NA         NA      NA       NA    
## nir_fd_coefs338         NA         NA      NA       NA    
## nir_fd_coefs339         NA         NA      NA       NA    
## nir_fd_coefs340         NA         NA      NA       NA    
## nir_fd_coefs341         NA         NA      NA       NA    
## nir_fd_coefs342         NA         NA      NA       NA    
## nir_fd_coefs343         NA         NA      NA       NA    
## nir_fd_coefs344         NA         NA      NA       NA    
## nir_fd_coefs345         NA         NA      NA       NA    
## nir_fd_coefs346         NA         NA      NA       NA    
## nir_fd_coefs347         NA         NA      NA       NA    
## nir_fd_coefs348         NA         NA      NA       NA    
## nir_fd_coefs349         NA         NA      NA       NA    
## nir_fd_coefs350         NA         NA      NA       NA    
## nir_fd_coefs351         NA         NA      NA       NA    
## nir_fd_coefs352         NA         NA      NA       NA    
## nir_fd_coefs353         NA         NA      NA       NA    
## nir_fd_coefs354         NA         NA      NA       NA    
## nir_fd_coefs355 -2.839e+09  1.094e+10  -0.260   0.7968    
## nir_fd_coefs356  2.513e+09  9.736e+09   0.258   0.7980    
## nir_fd_coefs357         NA         NA      NA       NA    
## nir_fd_coefs358         NA         NA      NA       NA    
## nir_fd_coefs359         NA         NA      NA       NA    
## nir_fd_coefs360         NA         NA      NA       NA    
## nir_fd_coefs361         NA         NA      NA       NA    
## nir_fd_coefs362         NA         NA      NA       NA    
## nir_fd_coefs363         NA         NA      NA       NA    
## nir_fd_coefs364         NA         NA      NA       NA    
## nir_fd_coefs365         NA         NA      NA       NA    
## nir_fd_coefs366         NA         NA      NA       NA    
## nir_fd_coefs367         NA         NA      NA       NA    
## nir_fd_coefs368         NA         NA      NA       NA    
## nir_fd_coefs369         NA         NA      NA       NA    
## nir_fd_coefs370         NA         NA      NA       NA    
## nir_fd_coefs371         NA         NA      NA       NA    
## nir_fd_coefs372         NA         NA      NA       NA    
## nir_fd_coefs373         NA         NA      NA       NA    
## nir_fd_coefs374         NA         NA      NA       NA    
## nir_fd_coefs375         NA         NA      NA       NA    
## nir_fd_coefs376         NA         NA      NA       NA    
## nir_fd_coefs377         NA         NA      NA       NA    
## nir_fd_coefs378 -1.332e+06  5.170e+06  -0.258   0.7983    
## nir_fd_coefs379         NA         NA      NA       NA    
## nir_fd_coefs380         NA         NA      NA       NA    
## nir_fd_coefs381         NA         NA      NA       NA    
## nir_fd_coefs382         NA         NA      NA       NA    
## nir_fd_coefs383         NA         NA      NA       NA    
## nir_fd_coefs384         NA         NA      NA       NA    
## nir_fd_coefs385         NA         NA      NA       NA    
## nir_fd_coefs386         NA         NA      NA       NA    
## nir_fd_coefs387         NA         NA      NA       NA    
## nir_fd_coefs388         NA         NA      NA       NA    
## nir_fd_coefs389         NA         NA      NA       NA    
## nir_fd_coefs390         NA         NA      NA       NA    
## nir_fd_coefs391         NA         NA      NA       NA    
## nir_fd_coefs392         NA         NA      NA       NA    
## nir_fd_coefs393         NA         NA      NA       NA    
## nir_fd_coefs394         NA         NA      NA       NA    
## nir_fd_coefs395         NA         NA      NA       NA    
## nir_fd_coefs396         NA         NA      NA       NA    
## nir_fd_coefs397         NA         NA      NA       NA    
## nir_fd_coefs398         NA         NA      NA       NA    
## nir_fd_coefs399         NA         NA      NA       NA    
## nir_fd_coefs400         NA         NA      NA       NA    
## nir_fd_coefs401         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.391 on 32 degrees of freedom
## Multiple R-squared:  0.5519, Adjusted R-squared:  0.1739 
## F-statistic:  1.46 on 27 and 32 DF,  p-value: 0.1519
# Predict octane ratings
octane_pred <- predict(lm_model)

results <- data.frame(Actual = octane, Predicted = octane_pred)

# Plot actual vs predicted values using ggplot
ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point(color = 'blue') +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Actual vs Predicted Octane Ratings",
       x = "Actual Octane Rating",
       y = "Predicted Octane Rating") +
  theme_minimal()

coefficients <- coef(lm_model)

coef_data <- data.frame(Index = 1:length(coefficients), Coefficient = coefficients)

# Plot the coefficients using ggplot
ggplot(coef_data, aes(x = Index, y = Coefficient)) +
  geom_bar(stat = "identity") +
  labs(title = "Coefficients of the NIR Spectra",
       x = "Coefficient Index",
       y = "Coefficient Value") +
  theme_minimal()

library(gamair)
library(ggplot2)
library(splines)
rm(list = ls())

data(gas)

# Extract the spectra and octane ratings
nir_spectra <- matrix(as.numeric(unlist(gas$NIR)), nrow = 60, byrow = TRUE)
colnames(nir_spectra) <- colnames(gas$NIR)
nir_spectra <- t(nir_spectra)  # Transpose to have 401 rows and 60 columns
octane <- gas$octane

# Define the B-spline basis functions
basis_functions <- bs(seq(900, 1700, by = 2), df = 20, degree = 3, intercept = TRUE)


basis_expansion <- function(data, basis) {
  expanded_data <- matrix(0, nrow = ncol(basis), ncol = ncol(data))
  for (i in 1:ncol(data)) {
    expanded_data[, i] <- t(basis) %*% data[, i]
  }
  return(expanded_data)
}

# Apply basis expansion
nir_fd_coefs <- basis_expansion(nir_spectra, basis_functions)


nir_fd_coefs_df <- as.data.frame(t(nir_fd_coefs))
colnames(nir_fd_coefs_df) <- paste0("X", 1:ncol(nir_fd_coefs_df))

model_data <- cbind(octane = octane, nir_fd_coefs_df)

# Fit the linear model
formula <- as.formula(paste("octane ~", paste(colnames(nir_fd_coefs_df), collapse = " + "), "- 1"))
lm_model <- lm(formula, data = model_data)

summary(lm_model)
## 
## Call:
## lm(formula = formula, data = model_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21.37  13.12  54.24  81.60 130.43 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)  
## X1   -1907.9     1025.8  -1.860   0.0703 .
## X2    1149.6     1948.9   0.590   0.5586  
## X3    -656.2     2360.8  -0.278   0.7825  
## X4     949.0     1831.7   0.518   0.6072  
## X5     570.8     2122.2   0.269   0.7893  
## X6   -1459.1     1701.9  -0.857   0.3964  
## X7    -190.4     2276.4  -0.084   0.9338  
## X8    -595.2     2231.0  -0.267   0.7910  
## X9    1507.7     2110.0   0.715   0.4790  
## X10    426.0     3048.5   0.140   0.8896  
## X11   1389.5     2558.7   0.543   0.5901  
## X12  -3365.9     2972.6  -1.132   0.2643  
## X13   -132.3     2817.8  -0.047   0.9628  
## X14   -351.4     2508.1  -0.140   0.8893  
## X15   4001.0     2626.5   1.523   0.1356  
## X16  -2283.2     2597.2  -0.879   0.3846  
## X17    744.2     2049.0   0.363   0.7184  
## X18  -2820.2     2314.7  -1.218   0.2302  
## X19   2217.7     1714.6   1.293   0.2033  
## X20    889.6      913.6   0.974   0.3360  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 83.7 on 40 degrees of freedom
## Multiple R-squared:  0.3856, Adjusted R-squared:  0.07842 
## F-statistic: 1.255 on 20 and 40 DF,  p-value: 0.2637
# First principles prediction using the fitted coefficients
fitted_coefficients <- coef(lm_model)
octane_pred_manual <- as.matrix(nir_fd_coefs_df) %*% fitted_coefficients

results <- data.frame(Actual = octane, Predicted = octane_pred_manual)

# Plot actual vs predicted values
ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point(color = 'blue') +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Actual vs Predicted Octane Ratings",
       x = "Actual Octane Rating",
       y = "Predicted Octane Rating") +
  theme_minimal()

print(fitted_coefficients)
##         X1         X2         X3         X4         X5         X6         X7 
## -1907.9197  1149.6367  -656.1988   948.9994   570.8192 -1459.0755  -190.3914 
##         X8         X9        X10        X11        X12        X13        X14 
##  -595.2310  1507.6761   426.0391  1389.5247 -3365.8842  -132.2741  -351.4226 
##        X15        X16        X17        X18        X19        X20 
##  4000.9962 -2283.1757   744.1567 -2820.2456  2217.7276   889.6180
nir_spectra_df <- as.data.frame(nir_spectra)
nir_spectra_df$Wavelength <- seq(900, 1700, by = 2)

nir_spectra_long <- tidyr::pivot_longer(nir_spectra_df, cols = -Wavelength, names_to = "Sample", values_to = "log1R")

ggplot(nir_spectra_long, aes(x = Wavelength, y = log1R, group = Sample, color = Sample)) +
  geom_line(alpha = 0.5) +
  labs(title = "NIR Spectra",
       x = "Wavelength (nm)",
       y = "log(1/R)") +
  theme_minimal() +
  theme(legend.position = "none")